{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 对活动数据进行分析\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导入工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "No module named utils",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-1-429b65f58a7c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      8\u001b[0m \u001b[1;31m#event的特征需要编码\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mutils\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mFeatureEng\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     10\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpreprocessing\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnormalize\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     11\u001b[0m \u001b[1;31m#相似度/距离\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mImportError\u001b[0m: No module named utils"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import scipy.sparse as ss\n",
    "import scipy.io as sio\n",
    "\n",
    "#保存数据\n",
    "import cPickle\n",
    "\n",
    "#event的特征需要编码\n",
    "from utils import FeatureEng\n",
    "from sklearn.preprocessing import normalize\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 统计活动数目"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取数据，并统计有多少不同的events\n",
    "#其实EDA.ipynb中用read_csv已经统计过了\n",
    "lines = 0\n",
    "fin = open(\"events.csv\", 'rb')\n",
    "#找到用C/C++的感觉了\n",
    "#字段：event_id, user_id,start_time, city, state, zip, country, lat, and lng， 101 columns of words count\n",
    "fin.readline() # skip header，列名行\n",
    "for line in fin:\n",
    "    cols = line.strip().split(\",\")\n",
    "    lines += 1\n",
    "fin.close()\n",
    "\n",
    "print(\"number of records :%d\" % lines)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取之前算好的测试集和训练集中出现过的活动\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取训练集和测试集中出现过的活动列表\n",
    "eventIndex = cPickle.load(open(\"PE_eventIndex.pkl\", 'rb'))\n",
    "n_events = len(eventIndex)\n",
    "\n",
    "print(\"number of events in train & test :%d\" % n_events)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 处理events.csv --> 特征编码、活动之间的相似度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "FE = FeatureEng()\n",
    "\n",
    "fin = open(\"events.csv\", 'rb')\n",
    "\n",
    "#字段：event_id, user_id,start_time, city, state, zip, country, lat, and lng， 101 columns of words count\n",
    "fin.readline() # skip header\n",
    "\n",
    "#start_time, city, state, zip, country, lat, and lng\n",
    "eventPropMatrix = ss.dok_matrix((n_events, 7))\n",
    "\n",
    "#词频特征\n",
    "eventContMatrix = ss.dok_matrix((n_events, 101))\n",
    "\n",
    "for line in fin.readlines():\n",
    "    cols = line.strip().split(\",\")\n",
    "    eventId = str(cols[0])\n",
    "    \n",
    "    if eventIndex.has_key(eventId):  #在训练集或测试集中出现\n",
    "        i = eventIndex[eventId]\n",
    "  \n",
    "        #event的特征编码，这里只是简单处理，其实开始时间，地点等信息很重要\n",
    "        eventPropMatrix[i, 0] = FE.getJoinedYearMonth(cols[2]) # start_time\n",
    "        eventPropMatrix[i, 1] = FE.getFeatureHash(cols[3]) # city\n",
    "        eventPropMatrix[i, 2] = FE.getFeatureHash(cols[4]) # state\n",
    "        eventPropMatrix[i, 3] = FE.getFeatureHash(cols[5]) # zip\n",
    "        eventPropMatrix[i, 4] = FE.getFeatureHash(cols[6]) # country\n",
    "        eventPropMatrix[i, 5] = FE.getFloatValue(cols[7]) # lat\n",
    "        eventPropMatrix[i, 6] = FE.getFloatValue(cols[8]) # lon\n",
    "        \n",
    "        #词频\n",
    "        for j in range(9, 110):\n",
    "            eventContMatrix[i, j-9] = cols[j]\n",
    "fin.close()\n",
    "\n",
    "#用L2模归一化,Kmeans聚类基于L2距离\n",
    "eventPropMatrix = normalize(eventPropMatrix,\n",
    "    norm=\"l2\", axis=0, copy=False)\n",
    "sio.mmwrite(\"EV_eventPropMatrix\", eventPropMatrix)\n",
    "\n",
    "#词频，可以考虑我们用这部分特征进行聚类，得到活动的genre\n",
    "eventContMatrix = normalize(eventContMatrix,\n",
    "    norm=\"l2\", axis=0, copy=False)\n",
    "sio.mmwrite(\"EV_eventContMatrix\", eventContMatrix)\n",
    "\n",
    "\n",
    "# calculate similarity between event pairs based on the two matrices\n",
    "eventPropSim = ss.dok_matrix((n_events, n_events))\n",
    "eventContSim = ss.dok_matrix((n_events, n_events))\n",
    "\n",
    "#读取在测试集和训练集中出现的活动对\n",
    "uniqueEventPairs = cPickle.load(open(\"PE_uniqueEventPairs.pkl\", 'rb'))\n",
    "\n",
    "for e1, e2 in uniqueEventPairs:\n",
    "    #i = eventIndex[e1]\n",
    "    #j = eventIndex[e2]\n",
    "    i = e1\n",
    "    j = e2\n",
    "    \n",
    "    #非词频特征，采用Person相关系数作为相似度\n",
    "    if not eventPropSim.has_key((i,j)):\n",
    "        epsim = ssd.correlation(eventPropMatrix.getrow(i).todense(),\n",
    "            eventPropMatrix.getrow(j).todense())\n",
    "        \n",
    "        eventPropSim[i, j] = epsim\n",
    "        eventPropSim[j, i] = epsim\n",
    "    \n",
    "    #对词频特征，采用余弦相似度，也可以用直方图交/Jacard相似度\n",
    "    if not eventContSim.has_key((i,j)):\n",
    "        ecsim = ssd.cosine(eventContMatrix.getrow(i).todense(),\n",
    "            eventContMatrix.getrow(j).todense())\n",
    "    \n",
    "        eventContSim[i, j] = epsim\n",
    "        eventContSim[j, i] = epsim\n",
    "    \n",
    "sio.mmwrite(\"EV_eventPropSim\", eventPropSim)\n",
    "sio.mmwrite(\"EV_eventContSim\", eventContSim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eventPropSim.getrow(0).todense()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
